In [5]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
import statsmodels.api as sm

from  matplotlib.ticker import PercentFormatter

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

Load The Data¶

In [3]:
# load the data into a pandas dataframe

data_path = '/content/drive/MyDrive/organics.csv'
df = pd.read_csv(data_path)

# preview data
df.info()
df.head
print(df)

# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
ID                    0
DemAffl            1085
DemAge             1508
DemCluster          674
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
PromClass             0
PromSpend             0
PromTime            281
TargetBuy             0
TargetAmt             0
dtype: int64

Drop Or Replace Missing Values¶

In [ ]:
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
df['DemGender'].fillna('U', inplace=True)
df.dropna(subset='DemAge', inplace=True)
print(df.isnull().sum())
ID                    0
DemAffl            1001
DemAge                0
DemCluster          620
DemClusterGroup     620
DemGender             0
DemReg              432
DemTVReg            432
PromClass             0
PromSpend             0
PromTime            259
TargetBuy             0
TargetAmt             0
dtype: int64
In [ ]:
#Drop customers with missing values in the affluence grade category
#See how this affects missing values in the other categories
df.dropna(subset='DemAffl', inplace=True)
print(df.isnull().sum())
ID                   0
DemAffl              0
DemAge               0
DemCluster         585
DemClusterGroup    585
DemGender            0
DemReg             411
DemTVReg           411
PromClass            0
PromSpend            0
PromTime           243
TargetBuy            0
TargetAmt            0
dtype: int64
In [ ]:
#Drop customers with missing values in the residental neigborhood type category
#See how this affects missing values in the other categories
df.dropna(subset='DemCluster', inplace=True)
print(df.isnull().sum())
ID                   0
DemAffl              0
DemAge               0
DemCluster           0
DemClusterGroup      0
DemGender            0
DemReg             399
DemTVReg           399
PromClass            0
PromSpend            0
PromTime           225
TargetBuy            0
TargetAmt            0
dtype: int64
In [ ]:
#See how many categories are in the geographic and television region to
#see if they would be detrimental to include in machine learning models
#This affects whether we should drop customers with data missing in
#these columns
print(df['DemReg'].nunique())
print(df['DemTVReg'].nunique())
5
12
In [ ]:
#Drop customers with missing values in the geographic region category
#See how this affects missing values in the other categories
df.dropna(subset='DemReg', inplace=True)
print(df.isnull().sum())
ID                   0
DemAffl              0
DemAge               0
DemCluster           0
DemClusterGroup      0
DemGender            0
DemReg               0
DemTVReg             0
PromClass            0
PromSpend            0
PromTime           220
TargetBuy            0
TargetAmt            0
dtype: int64
In [ ]:
#Drop customers with missing values in the category about how much time they
#were a loyalty member for
#See how this affects missing values in the other categories
df.dropna(subset='PromTime', inplace=True)
print(df.isnull().sum())
ID                 0
DemAffl            0
DemAge             0
DemCluster         0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
PromClass          0
PromSpend          0
PromTime           0
TargetBuy          0
TargetAmt          0
dtype: int64

Preprocess The Data¶

In [ ]:
#Make categorical columns into numerical dummies where each category is replaced
#by a number starting from 0
def dummyize(x, columns):
    for column in range(0,len(columns)):
        if x == columns[column]:
            return column
In [ ]:
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemGender']).columns))
df["DemClusterGroup"] = df["DemClusterGroup"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemClusterGroup']).columns))
df["DemReg"] = df["DemReg"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemReg']).columns))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(df['PromClass']).columns))
In [ ]:
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemTVReg', 'TargetAmt', 'ID'], axis=1, inplace=True)
In [ ]:
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
In [ ]:
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X = df.drop('TargetBuy', axis=1)

y = df['TargetBuy']

#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12957 entries, 779 to 18057
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DemAffl          12957 non-null  float64
 1   DemAge           12957 non-null  float64
 2   DemCluster       12957 non-null  float64
 3   DemClusterGroup  12957 non-null  int64  
 4   DemGender        12957 non-null  int64  
 5   DemReg           12957 non-null  int64  
 6   PromClass        12957 non-null  int64  
 7   PromSpend        12957 non-null  float64
 8   PromTime         12957 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 1012.3 KB
<class 'pandas.core.series.Series'>
Int64Index: 12957 entries, 779 to 18057
Series name: TargetBuy
Non-Null Count  Dtype   
--------------  -----   
12957 non-null  category
dtypes: category(1)
memory usage: 114.0 KB

Default Tree¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)

# fit the model to the training data
dt.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default")
Out[ ]:
'decision_tree_default.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7ff4593df2b0>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt.predict(X_train)
y_pred_test = dt.predict(X_test)

y_prob_train = dt.predict_proba(X_train)
y_prob_test = dt.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the test set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the test set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.8210
Precision: 0.7204
Recall.  : 0.4346

 -- test set -- 
Accuracy : 0.8131
Precision: 0.6944
Recall.  : 0.4138
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[9265 1786]
 [ 533 1373]]
[[3956  792]
 [ 246  559]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df['feature']=X.columns

#sort data so features with largest importance values are at the top
df2 = df.sort_values(by=['importance'], ascending=False)

print(df2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df2,
            y=df2['feature'],
            x=df2['importance'], color="lightblue")

plt.show()
   importance          feature
1    0.506989           DemAge
0    0.341903          DemAffl
4    0.151108        DemGender
2    0.000000       DemCluster
3    0.000000  DemClusterGroup
5    0.000000           DemReg
6    0.000000        PromClass
7    0.000000        PromSpend
8    0.000000         PromTime

Tuned Tree¶

In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the training set so we have "tuned" parameters or values
dt_tune = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
print(best_estimator)
{'ccp_alpha': 0, 'max_depth': 5, 'min_samples_leaf': 10}
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
In [ ]:
# create an instance of a decision tree classifier using "tuned" values

dt_tuned = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0)

# fit the model to the training data
dt_tuned.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=25, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned")
Out[ ]:
'decision_tree_tuned.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.721089 to fit
Out[ ]:
<matplotlib.image.AxesImage at 0x7ff45e445600>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned.predict(X_train)
y_pred_test = dt_tuned.predict(X_test)

y_prob_train = dt_tuned.predict_proba(X_train)
y_prob_test = dt_tuned.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.8567
Precision: 0.7763
Recall.  : 0.5790

 -- test set -- 
Accuracy : 0.7936
Precision: 0.6060
Recall.  : 0.4338
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[9271 1330]
 [ 527 1829]]
[[3821  765]
 [ 381  586]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_tuned.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df['feature']=X.columns

#sort data so features with largest importance values are at the top
df2 = df.sort_values(by=['importance'], ascending=False)

print(df2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df2,
            y=df2['feature'],
            x=df2['importance'], color="lightblue")

plt.show()
   importance          feature
1    0.394919           DemAge
0    0.260241          DemAffl
4    0.105844        DemGender
2    0.084088       DemCluster
7    0.063271        PromSpend
8    0.049167         PromTime
5    0.029433           DemReg
6    0.007883        PromClass
3    0.005154  DemClusterGroup

Default Tree, More Data¶

Reload the data¶

In [4]:
#Reload the data to include data from customers who were dropped for missing
#data in categories that were not used in the previous tree models
df = pd.read_csv(data_path)

# preview data
df.info()
df.head
print(df)

# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
ID                    0
DemAffl            1085
DemAge             1508
DemCluster          674
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
PromClass             0
PromSpend             0
PromTime            281
TargetBuy             0
TargetAmt             0
dtype: int64

Drop Or Replace Missing Values¶

In [5]:
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
df['DemGender'].fillna('U', inplace=True)
df.dropna(subset=['DemAge', 'DemAffl'], inplace=True)
print(df.isnull().sum())
ID                   0
DemAffl              0
DemAge               0
DemCluster         585
DemClusterGroup    585
DemGender            0
DemReg             411
DemTVReg           411
PromClass            0
PromSpend            0
PromTime           243
TargetBuy            0
TargetAmt            0
dtype: int64

Preprocess The Data¶

In [6]:
#Drop the columns that were not used in the previous models so we can use the
#data from customers with values missing in those columns
#
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemCluster', 'DemClusterGroup', 'DemReg', 'DemTVReg', 'PromTime', 'ID', 'TargetAmt'], axis=1, inplace=True)
print(df.isnull().sum())
DemAffl      0
DemAge       0
DemGender    0
PromClass    0
PromSpend    0
TargetBuy    0
dtype: int64
In [ ]:
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemGender']).columns))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(df['PromClass']).columns))
In [ ]:
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
In [ ]:
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X = df.drop('TargetBuy', axis=1)

y = df['TargetBuy']

#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13799 entries, 9631 to 16966
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   DemAffl    13799 non-null  float64
 1   DemAge     13799 non-null  float64
 2   DemGender  13799 non-null  int64  
 3   PromClass  13799 non-null  int64  
 4   PromSpend  13799 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 646.8 KB
<class 'pandas.core.series.Series'>
Int64Index: 13799 entries, 9631 to 16966
Series name: TargetBuy
Non-Null Count  Dtype   
--------------  -----   
13799 non-null  category
dtypes: category(1)
memory usage: 121.4 KB

Create And Assess The Model¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_more_data = DecisionTreeClassifier(max_depth = 25, min_samples_leaf=10, ccp_alpha = 0.001)

# fit the model to the training data
dt_more_data.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=25, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_more_data, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_default_more_data")
Out[ ]:
'decision_tree_default_more_data.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_more_data,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7ff458e69b10>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_more_data.predict(X_train)
y_pred_test = dt_more_data.predict(X_test)

y_prob_train = dt_more_data.predict_proba(X_train)
y_prob_test = dt_more_data.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.8147
Precision: 0.6517
Recall.  : 0.5543

 -- test set -- 
Accuracy : 0.8213
Precision: 0.6367
Recall.  : 0.5631
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[9332 1536]
 [1021 1910]]
[[4073  609]
 [ 448  785]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_more_data.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df['feature']=X.columns

#sort data so features with largest importance values are at the top
df2 = df.sort_values(by=['importance'], ascending=False)

print(df2)

#Create variable importance plot
plt.figure(figsize=(40, 20))
plt.title('Variable Importance', fontsize=18)
plt.xlabel('Importance', fontsize=18)
plt.ylabel('Feature', fontsize=18)
plt.tick_params(axis='x', which='major', labelsize=18)
plt.tick_params(axis='y', which='major', labelsize=25)
sns.barplot(data=df2,
            y=df2['feature'],
            x=df2['importance'], color="lightblue")

plt.show()
   importance    feature
1    0.512657     DemAge
0    0.337917    DemAffl
2    0.149426  DemGender
3    0.000000  PromClass
4    0.000000  PromSpend

Tuned Tree, More Data¶

In [ ]:
#Use a grid search with a decision tree to determine which parameters obatin the
#best scores on the new training set so we have "tuned" parameters or values
#See "Default Tree, More Data" for the reloading of the data with more
#customers included
dt_tune = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
print(best_estimator)
{'ccp_alpha': 0, 'max_depth': 5, 'min_samples_leaf': 1}
DecisionTreeClassifier(ccp_alpha=0, max_depth=5)
In [ ]:
# create an instance of a decision tree classifier using default values

dt_tuned_more_data = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=1, ccp_alpha = 0)

# fit the model to the training data
dt_tuned_more_data.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_more_data, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_more_data")
Out[ ]:
'decision_tree_tuned_more_data.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_more_data,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7ff458ffff10>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_more_data.predict(X_train)
y_pred_test = dt_tuned_more_data.predict(X_test)

y_prob_train = dt_tuned_more_data.predict_proba(X_train)
y_prob_test = dt_tuned_more_data.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.8179
Precision: 0.6671
Recall.  : 0.5403

 -- test set -- 
Accuracy : 0.8230
Precision: 0.6482
Recall.  : 0.5445
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[9424 1584]
 [ 929 1862]]
[[4109  635]
 [ 412  759]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_more_data.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df['feature']=X.columns

#sort data so features with largest importance values are at the top
df2 = df.sort_values(by=['importance'], ascending=False)

print(df2)

#Create variable importance plot
plt.figure(figsize=(5, 10))
plt.title('Variable Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
sns.barplot(data=df2,
            y=df2['feature'],
            x=df2['importance'], color="lightblue")

plt.show()
   importance    feature
1    0.503338     DemAge
0    0.337206    DemAffl
2    0.158338  DemGender
4    0.001118  PromSpend
3    0.000000  PromClass

Modified Tuned Tree, More Data¶

In [ ]:
# create an instance of a decision tree classifier using default values

dt_tuned_mod_more_data = DecisionTreeClassifier(max_depth = 5, min_samples_leaf=10, ccp_alpha = 0)

# fit the model to the training data
dt_tuned_mod_more_data.fit(X_train, y_train)
Out[ ]:
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0, max_depth=5, min_samples_leaf=10)
In [ ]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
# visualize the decision tree
dot_data = export_graphviz(dt_tuned_mod_more_data, filled=True, rounded=True, feature_names=X.columns, class_names=['0', '1'])
graph = graphviz.Source(dot_data)
graph.render("decision_tree_tuned_more_data_mod")
Out[ ]:
'decision_tree_tuned_more_data_mod.pdf'
In [ ]:
#Show the visualization of the decision tree in this notebook
export_graphviz(dt_tuned_mod_more_data,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[ ]:
<matplotlib.image.AxesImage at 0x7ff45e1b2290>
In [ ]:
# make predictions on the training and test data
y_pred_train = dt_tuned_mod_more_data.predict(X_train)
y_pred_test = dt_tuned_mod_more_data.predict(X_test)

y_prob_train = dt_tuned_mod_more_data.predict_proba(X_train)
y_prob_test = dt_tuned_mod_more_data.predict_proba(X_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, y_pred_train)
prec_train = precision_score(y_train, y_pred_train)
rec_train = recall_score(y_train, y_pred_train)

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, y_pred_test)
prec_test = precision_score(y_test, y_pred_test)
rec_test = recall_score(y_test, y_pred_test)

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.8177
Precision: 0.6657
Recall.  : 0.5421

 -- test set -- 
Accuracy : 0.8235
Precision: 0.6486
Recall.  : 0.5481
In [ ]:
# confusion matrix for training set
conf_matrix = confusion_matrix(y_pred_train, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(y_pred_test, y_test)
print(conf_matrix)
[[9415 1578]
 [ 938 1868]]
[[4107  630]
 [ 414  764]]
In [ ]:
# Generate ROC curve for training data

fpr, tpr, thresholds = roc_curve(y_train.cat.codes, y_prob_train[:,1])
roc_auc = roc_auc_score(y_train.cat.codes, y_prob_train[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, y_prob_test[:,1])
roc_auc = roc_auc_score(y_test.cat.codes, y_prob_test[:,1])

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [ ]:
#calculate feature importance
tree_imp = dt_tuned_mod_more_data.feature_importances_

#create a data frame with feature names

# creating a list of column names
column_values = ['importance']

# creating the dataframe
df = pd.DataFrame(data = tree_imp,
                  columns = column_values)
df['feature']=X.columns

#sort data so features with largest importance values are at the top
df2 = df.sort_values(by=['importance'], ascending=False)

print(df2)

#Create variable importance plot
plt.figure(figsize=(40, 20))
plt.title('Variable Importance', fontsize=18)
plt.xlabel('Importance', fontsize=18)
plt.ylabel('Feature', fontsize=18)
plt.tick_params(axis='x', which='major', labelsize=18)
plt.tick_params(axis='y', which='major', labelsize=25)
sns.barplot(data=df2,
            y=df2['feature'],
            x=df2['importance'], color="lightblue")

plt.show()
   importance    feature
1    0.505439     DemAge
0    0.338826    DemAffl
2    0.155735  DemGender
3    0.000000  PromClass
4    0.000000  PromSpend

Full Logistic¶

Reload the data¶

In [7]:
#Reload the data to include all of the columns except for the ones that were
#determined to be detrimental to machine learning models as it is in the
#original tree models
df = pd.read_csv(data_path)

# preview data
df.info()
df.head
print(df)

# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
ID                    0
DemAffl            1085
DemAge             1508
DemCluster          674
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
PromClass             0
PromSpend             0
PromTime            281
TargetBuy             0
TargetAmt             0
dtype: int64

Drop Or Replace Missing Values¶

In [8]:
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
df['DemGender'].fillna('U', inplace=True)
df.dropna(subset='DemAge', inplace=True)
print(df.isnull().sum())
ID                    0
DemAffl            1001
DemAge                0
DemCluster          620
DemClusterGroup     620
DemGender             0
DemReg              432
DemTVReg            432
PromClass             0
PromSpend             0
PromTime            259
TargetBuy             0
TargetAmt             0
dtype: int64
In [9]:
#Drop customers with missing values
df.dropna(subset=['DemAffl', 'DemCluster', 'DemReg', 'PromTime'], inplace=True)
print(df.isnull().sum())
ID                 0
DemAffl            0
DemAge             0
DemCluster         0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
PromClass          0
PromSpend          0
PromTime           0
TargetBuy          0
TargetAmt          0
dtype: int64

Preprocess The Data¶

In [10]:
#Make categorical columns into numerical dummies where each category is replaced by a number starting from 0
def dummyize(x, columns):
    for column in range(0,len(columns)):
        if x == columns[column]:
            return column
In [11]:
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemGender']).columns))
df["DemClusterGroup"] = df["DemClusterGroup"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemClusterGroup']).columns))
df["DemReg"] = df["DemReg"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemReg']).columns))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(df['PromClass']).columns))
In [12]:
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemTVReg', 'TargetAmt', 'ID'], axis=1, inplace=True)
In [13]:
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
In [14]:
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X = df.drop('TargetBuy', axis=1)

y = df['TargetBuy']

#Split the data into training and test sets to be able to train and compare models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=77)

X_train.info()
y_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12957 entries, 779 to 18057
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DemAffl          12957 non-null  float64
 1   DemAge           12957 non-null  float64
 2   DemCluster       12957 non-null  float64
 3   DemClusterGroup  12957 non-null  int64  
 4   DemGender        12957 non-null  int64  
 5   DemReg           12957 non-null  int64  
 6   PromClass        12957 non-null  int64  
 7   PromSpend        12957 non-null  float64
 8   PromTime         12957 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 1012.3 KB
<class 'pandas.core.series.Series'>
Int64Index: 12957 entries, 779 to 18057
Series name: TargetBuy
Non-Null Count  Dtype   
--------------  -----   
12957 non-null  category
dtypes: category(1)
memory usage: 114.0 KB

Create And Assess The Model¶

In [15]:
#Fit full logistic regression model to examine the significance of the terms
X = X_train
X = sm.add_constant(X)
y=y_train

logit_full = sm.Logit(y, X).fit()

#Print summary
print(logit_full.summary())
Optimization terminated successfully.
         Current function value: 0.419787
         Iterations 7
                           Logit Regression Results                           
==============================================================================
Dep. Variable:              TargetBuy   No. Observations:                12957
Model:                          Logit   Df Residuals:                    12947
Method:                           MLE   Df Model:                            9
Date:                Tue, 21 Nov 2023   Pseudo R-squ.:                  0.2442
Time:                        23:05:42   Log-Likelihood:                -5439.2
converged:                       True   LL-Null:                       -7196.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
===================================================================================
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.2589      0.160     -1.622      0.105      -0.572       0.054
DemAffl             0.2612      0.008     33.495      0.000       0.246       0.276
DemAge             -0.0550      0.002    -25.985      0.000      -0.059      -0.051
DemCluster         -0.0010      0.009     -0.111      0.912      -0.018       0.016
DemClusterGroup     0.0334      0.086      0.388      0.698      -0.135       0.202
DemGender          -0.9445      0.038    -25.061      0.000      -1.018      -0.871
DemReg             -0.0041      0.018     -0.227      0.820      -0.040       0.031
PromClass          -0.0446      0.026     -1.735      0.083      -0.095       0.006
PromSpend       -1.732e-06   3.92e-06     -0.442      0.659   -9.42e-06    5.96e-06
PromTime            0.0059      0.006      1.026      0.305      -0.005       0.017
===================================================================================
In [16]:
# Generate predicted values for training set
pprob = logit_full.predict(X)

# Create predicted category for success using 50% cutoff
psuccess = (pprob > 0.5).astype(int)

# Add new variables to the training data set
X_train['p_success'] = psuccess
X_train['p_prob'] = pprob
X_train['y'] = y_train

X_train.info()


# Generate predicted values for test set

X_test = sm.add_constant(X_test)

pprob_test = logit_full.predict(X_test)

# Create predicted category for success using 50% cutoff
psuccess_test = (pprob_test > 0.5).astype(int)

# Add new variables to the response data set
X_test['p_success'] = psuccess_test
X_test['p_prob'] = pprob_test

X_test.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12957 entries, 779 to 18057
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   DemAffl          12957 non-null  float64 
 1   DemAge           12957 non-null  float64 
 2   DemCluster       12957 non-null  float64 
 3   DemClusterGroup  12957 non-null  int64   
 4   DemGender        12957 non-null  int64   
 5   DemReg           12957 non-null  int64   
 6   PromClass        12957 non-null  int64   
 7   PromSpend        12957 non-null  float64 
 8   PromTime         12957 non-null  float64 
 9   p_success        12957 non-null  int64   
 10  p_prob           12957 non-null  float64 
 11  y                12957 non-null  category
dtypes: category(1), float64(6), int64(5)
memory usage: 1.2 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5553 entries, 13248 to 2753
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   const            5553 non-null   float64
 1   DemAffl          5553 non-null   float64
 2   DemAge           5553 non-null   float64
 3   DemCluster       5553 non-null   float64
 4   DemClusterGroup  5553 non-null   int64  
 5   DemGender        5553 non-null   int64  
 6   DemReg           5553 non-null   int64  
 7   PromClass        5553 non-null   int64  
 8   PromSpend        5553 non-null   float64
 9   PromTime         5553 non-null   float64
 10  p_success        5553 non-null   int64  
 11  p_prob           5553 non-null   float64
dtypes: float64(7), int64(5)
memory usage: 564.0 KB
In [17]:
# Generate confusion matrix for training set
conf_matrix = confusion_matrix(psuccess, y_train)
print(conf_matrix)

# Generate confusion matrix for test set
conf_matrix = confusion_matrix(psuccess_test, y_test)
print(conf_matrix)
[[9250 1814]
 [ 548 1345]]
[[3966  814]
 [ 236  537]]
In [18]:
# Generate ROC curve for training data
fpr, tpr, thresholds = roc_curve(y_train.cat.codes, pprob)
roc_auc = roc_auc_score(y_train.cat.codes, pprob)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Training Data')
plt.legend(loc='lower right')
plt.show()
In [19]:
# Generate ROC curve for test data
fpr, tpr, thresholds = roc_curve(y_test.cat.codes, pprob_test)
roc_auc = roc_auc_score(y_test.cat.codes, pprob_test)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Test Data')
plt.legend(loc='lower right')
plt.show()
In [20]:
# calculate the accuracy, precision, and recall scores for the training set
acc_train = accuracy_score(y_train, psuccess)
prec_train = precision_score(y_train, psuccess)
rec_train = recall_score(y_train, psuccess)

# print the scores for the training set

print("Accuracy (Train) : {:.4f}".format(acc_train))
print("Precision (Train): {:.4f}".format(prec_train))
print("Recall (Train)  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc_test = accuracy_score(y_test, psuccess_test)
prec_test = precision_score(y_test, psuccess_test)
rec_test = recall_score(y_test, psuccess_test)

# print the scores for the testing set

print("Accuracy (Test) : {:.4f}".format(acc_test))
print("Precision (Test): {:.4f}".format(prec_test))
print("Recall (Test)  : {:.4f}".format(rec_test))
print("")
Accuracy (Train) : 0.8177
Precision (Train): 0.7105
Recall (Train)  : 0.4258

Accuracy (Test) : 0.8109
Precision (Test): 0.6947
Recall (Test)  : 0.3975

SKLearn Models¶

Reload The Data¶

In [ ]:
#Reload the data
df = pd.read_csv(data_path)

# preview data
df.info()
df.head
print(df)

# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
ID                    0
DemAffl            1085
DemAge             1508
DemCluster          674
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
PromClass             0
PromSpend             0
PromTime            281
TargetBuy             0
TargetAmt             0
dtype: int64

Drop Or Replace Missing Values¶

In [ ]:
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
df['DemGender'].fillna('U', inplace=True)

#Drop customers with missing data
df.dropna(subset='DemAge', inplace=True)
df.dropna(subset='DemAffl', inplace=True)
df.dropna(subset='DemCluster', inplace=True)
df.dropna(subset='DemReg', inplace=True)
df.dropna(subset='PromTime', inplace=True)
print(df.isnull().sum())
ID                 0
DemAffl            0
DemAge             0
DemCluster         0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
PromClass          0
PromSpend          0
PromTime           0
TargetBuy          0
TargetAmt          0
dtype: int64

Preprocess The Data¶

In [ ]:
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemGender']).columns))
df["DemClusterGroup"] = df["DemClusterGroup"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemClusterGroup']).columns))
df["DemReg"] = df["DemReg"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemReg']).columns))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(df['PromClass']).columns))
In [ ]:
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemTVReg', 'TargetAmt', 'ID'], axis=1, inplace=True)
In [ ]:
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
In [ ]:
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X2 = df.drop('TargetBuy', axis=1)

y2 = df['TargetBuy']

#Split the data into training and test sets to be able to train and compare models
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=77)

X2_train.info()
y2_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12957 entries, 779 to 18057
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DemAffl          12957 non-null  float64
 1   DemAge           12957 non-null  float64
 2   DemCluster       12957 non-null  float64
 3   DemClusterGroup  12957 non-null  int64  
 4   DemGender        12957 non-null  int64  
 5   DemReg           12957 non-null  int64  
 6   PromClass        12957 non-null  int64  
 7   PromSpend        12957 non-null  float64
 8   PromTime         12957 non-null  float64
dtypes: float64(5), int64(4)
memory usage: 1012.3 KB
<class 'pandas.core.series.Series'>
Int64Index: 12957 entries, 779 to 18057
Series name: TargetBuy
Non-Null Count  Dtype   
--------------  -----   
12957 non-null  category
dtypes: category(1)
memory usage: 114.0 KB

Create The LASSO and Ridge Regression Models¶

In [ ]:
# Create an Instance of Logistic Regression for LASSO Selection  using c = 0.1 and c = 0.01

lr_l1_1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
lr_l1_01 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)

# fit the models to the training data
lr_l1_1.fit(X2_train, y2_train)
lr_l1_01.fit(X2_train, y2_train)

# Create an Instance of Logistic Regression for LASSO Selection  using c = 1 and c = 0.7
lr_l1_10 = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lr_l1_7 = LogisticRegression(penalty='l1', solver='liblinear', C=0.7)

# fit the models to the training data
lr_l1_10.fit(X2_train, y2_train)
lr_l1_7.fit(X2_train, y2_train)

# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2 = LogisticRegression(penalty='l2', solver='liblinear')

# fit the models to the training data
lr_l2.fit(X2_train, y2_train)
Out[ ]:
LogisticRegression(solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(solver='liblinear')

Analyze The Importance Of Different Categories In The Models¶

In [ ]:
# function for model coefficients
def rpt_model_variables(model):
    # Get the intercept term
    intercept = model.intercept_

    # Access the coefficients (weights) of the model, i rounded them
    coefficients = np.round(model.coef_[0],decimals=4)

    # Create DataFrames for intercept and coefficients
    #df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
    df_coefficients = pd.DataFrame({'feature': X2.columns, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

    # if you want to add intercept to table
    #df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)

    # Print the DataFrame
    print(df_coefficients)

    return df_coefficients

#Evalute the model coefficients for the models
print("Lasso C=0.1")
df_coefficients1 = rpt_model_variables(lr_l1_1)
print("")
print("Lasso C=0.01")
df_coefficients01 = rpt_model_variables(lr_l1_01)
print("")
print("Lasso C=1")
df_coefficients10 = rpt_model_variables(lr_l1_10)
print("")
print("Lasso C=0.7")
df_coefficients7 = rpt_model_variables(lr_l1_7)
print("")
print("Ridge Regression")
df_coefficients2 = rpt_model_variables(lr_l2)
Lasso C=0.1
           feature  coefficient  abs_coefficient
4        DemGender      -0.9322           0.9322
0          DemAffl       0.2572           0.2572
3  DemClusterGroup       0.1210           0.1210
1           DemAge      -0.0560           0.0560
6        PromClass      -0.0468           0.0468
2       DemCluster      -0.0105           0.0105
5           DemReg      -0.0057           0.0057
8         PromTime       0.0047           0.0047
7        PromSpend      -0.0000           0.0000

Lasso C=0.01
           feature  coefficient  abs_coefficient
4        DemGender      -0.8159           0.8159
0          DemAffl       0.2408           0.2408
1           DemAge      -0.0567           0.0567
6        PromClass      -0.0113           0.0113
2       DemCluster       0.0001           0.0001
3  DemClusterGroup       0.0000           0.0000
5           DemReg       0.0000           0.0000
7        PromSpend       0.0000           0.0000
8         PromTime       0.0000           0.0000

Lasso C=1
           feature  coefficient  abs_coefficient
4        DemGender      -0.9420           0.9420
0          DemAffl       0.2619           0.2619
1           DemAge      -0.0542           0.0542
6        PromClass      -0.0394           0.0394
3  DemClusterGroup      -0.0067           0.0067
8         PromTime       0.0056           0.0056
2       DemCluster       0.0032           0.0032
5           DemReg      -0.0026           0.0026
7        PromSpend      -0.0000           0.0000

Lasso C=0.7
           feature  coefficient  abs_coefficient
4        DemGender      -0.9398           0.9398
0          DemAffl       0.2632           0.2632
1           DemAge      -0.0534           0.0534
6        PromClass      -0.0332           0.0332
8         PromTime       0.0052           0.0052
3  DemClusterGroup       0.0050           0.0050
2       DemCluster       0.0023           0.0023
5           DemReg      -0.0005           0.0005
7        PromSpend      -0.0000           0.0000

Ridge Regression
           feature  coefficient  abs_coefficient
4        DemGender      -0.8877           0.8877
0          DemAffl       0.2501           0.2501
6        PromClass      -0.0607           0.0607
1           DemAge      -0.0571           0.0571
5           DemReg       0.0192           0.0192
3  DemClusterGroup       0.0151           0.0151
8         PromTime       0.0045           0.0045
2       DemCluster      -0.0001           0.0001
7        PromSpend      -0.0000           0.0000
In [ ]:
# plot variable importance

# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp(df_coef):
  # determine the variables the model is using and create df
  # of their absolute coefficients
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  # determine the variables the model is not using
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  # bar graph of the absolute coefficients that the model is using
  plt.figure(figsize=(5, 10))
  plt.title('Variable Importance')
  plt.xlabel('Coefficient')
  plt.ylabel('Feature')
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  # print the variables the model is not using after the bar graph
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")

# plot the variable importance for the models
print("Lasso C=0.1")
plot_variable_imp(df_coefficients1)
print("")
print("Lasso C=0.01")
plot_variable_imp(df_coefficients01)
print("")
print("Lasso C=1")
plot_variable_imp(df_coefficients10)
print("")
print("Lasso C=0.7")
plot_variable_imp(df_coefficients7)
print("")
print("Ridge Regression")
plot_variable_imp(df_coefficients2)
Lasso C=0.1
-- rejected --
 PromSpend

Lasso C=0.01
-- rejected --
 DemClusterGroup
 DemReg
 PromSpend
 PromTime

Lasso C=1
-- rejected --
 PromSpend

Lasso C=0.7
-- rejected --
 PromSpend

Ridge Regression
-- rejected --
 PromSpend

Make Predictions To Evaluate The Models¶

In [ ]:
# make predictions on the training and testing data for all of the models to
# evaluate the models

#Lasso C=0.1
y_pred_train = lr_l1_1.predict(X2_train)
y_pred_test = lr_l1_1.predict(X2_test)
y_proba_train = lr_l1_1.predict_proba(X2_train)
y_proba_test = lr_l1_1.predict_proba(X2_test)

#Lasso C=0.01
y_pred_train1 = lr_l1_01.predict(X2_train)
y_pred_test1 = lr_l1_01.predict(X2_test)
y_proba_train1 = lr_l1_01.predict_proba(X2_train)
y_proba_test1 = lr_l1_01.predict_proba(X2_test)

#Lasso C=1
y_pred_train10 = lr_l1_10.predict(X2_train)
y_pred_test10 = lr_l1_10.predict(X2_test)
y_proba_train10 = lr_l1_10.predict_proba(X2_train)
y_proba_test10 = lr_l1_10.predict_proba(X2_test)

#Lasso C=0.7
y_pred_train7 = lr_l1_7.predict(X2_train)
y_pred_test7 = lr_l1_7.predict(X2_test)
y_proba_train7 = lr_l1_7.predict_proba(X2_train)
y_proba_test7 = lr_l1_7.predict_proba(X2_test)

#Ridge Regression
y_pred_train2 = lr_l2.predict(X2_train)
y_pred_test2 = lr_l2.predict(X2_test)
y_proba_train2 = lr_l2.predict_proba(X2_train)
y_proba_test2 = lr_l2.predict_proba(X2_test)

Evaluate The Models¶

L1 with c=0.1

In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train = accuracy_score(y2_train, y_pred_train)
prec2_train = precision_score(y2_train, y_pred_train)
rec2_train = recall_score(y2_train, y_pred_train)
auc2_train = roc_auc_score(y2_train, y_proba_train[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train))
print("Precision: {:.4f}".format(prec2_train))
print("Recall.  : {:.4f}".format(rec2_train))
print("AUC      : {:.4f}".format(auc2_train))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test = accuracy_score(y2_test, y_pred_test)
prec2_test = precision_score(y2_test, y_pred_test)
rec2_test = recall_score(y2_test, y_pred_test)
auc2_test = roc_auc_score(y2_test, y_proba_test[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test))
print("Precision: {:.4f}".format(prec2_test))
print("Recall.  : {:.4f}".format(rec2_test))
print("AUC      : {:.4f}".format(auc2_test))
 -- train set -- 
Accuracy : 0.8180
Precision: 0.7116
Recall.  : 0.4264
AUC      : 0.8166

 -- test set -- 
Accuracy : 0.8115
Precision: 0.6939
Recall.  : 0.4027
AUC      : 0.8008

L1 with c=0.01

In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train1 = accuracy_score(y2_train, y_pred_train1)
prec2_train1 = precision_score(y2_train, y_pred_train1)
rec2_train1 = recall_score(y2_train, y_pred_train1)
auc2_train1 = roc_auc_score(y2_train, y_proba_train1[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train1))
print("Precision: {:.4f}".format(prec2_train1))
print("Recall.  : {:.4f}".format(rec2_train1))
print("AUC      : {:.4f}".format(auc2_train1))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test1 = accuracy_score(y2_test, y_pred_test1)
prec2_test1 = precision_score(y2_test, y_pred_test1)
rec2_test1 = recall_score(y2_test, y_pred_test1)
auc2_test1 = roc_auc_score(y2_test, y_proba_test1[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test1))
print("Precision: {:.4f}".format(prec2_test1))
print("Recall.  : {:.4f}".format(rec2_test1))
print("AUC      : {:.4f}".format(auc2_test1))
 -- train set -- 
Accuracy : 0.8186
Precision: 0.7224
Recall.  : 0.4160
AUC      : 0.8153

 -- test set -- 
Accuracy : 0.8118
Precision: 0.7051
Recall.  : 0.3893
AUC      : 0.8005

L1 with C=1

In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train10 = accuracy_score(y2_train, y_pred_train10)
prec2_train10 = precision_score(y2_train, y_pred_train10)
rec2_train10 = recall_score(y2_train, y_pred_train10)
auc2_train10 = roc_auc_score(y2_train, y_proba_train10[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train10))
print("Precision: {:.4f}".format(prec2_train10))
print("Recall.  : {:.4f}".format(rec2_train10))
print("AUC      : {:.4f}".format(auc2_train10))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test10 = accuracy_score(y2_test, y_pred_test10)
prec2_test10 = precision_score(y2_test, y_pred_test10)
rec2_test10 = recall_score(y2_test, y_pred_test10)
auc2_test10 = roc_auc_score(y2_test, y_proba_test10[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test10))
print("Precision: {:.4f}".format(prec2_test10))
print("Recall.  : {:.4f}".format(rec2_test10))
print("AUC      : {:.4f}".format(auc2_test10))
 -- train set -- 
Accuracy : 0.8180
Precision: 0.7118
Recall.  : 0.4261
AUC      : 0.8167

 -- test set -- 
Accuracy : 0.8098
Precision: 0.6913
Recall.  : 0.3945
AUC      : 0.8009

L1 with C=0.7

In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train7 = accuracy_score(y2_train, y_pred_train7)
prec2_train7 = precision_score(y2_train, y_pred_train7)
rec2_train7 = recall_score(y2_train, y_pred_train7)
auc2_train7 = roc_auc_score(y2_train, y_proba_train7[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train7))
print("Precision: {:.4f}".format(prec2_train7))
print("Recall.  : {:.4f}".format(rec2_train7))
print("AUC      : {:.4f}".format(auc2_train7))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test7 = accuracy_score(y2_test, y_pred_test7)
prec2_test7 = precision_score(y2_test, y_pred_test7)
rec2_test7 = recall_score(y2_test, y_pred_test7)
auc2_test7 = roc_auc_score(y2_test, y_proba_test7[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test7))
print("Precision: {:.4f}".format(prec2_test7))
print("Recall.  : {:.4f}".format(rec2_test7))
print("AUC      : {:.4f}".format(auc2_test7))
 -- train set -- 
Accuracy : 0.8180
Precision: 0.7127
Recall.  : 0.4248
AUC      : 0.8167

 -- test set -- 
Accuracy : 0.8100
Precision: 0.6922
Recall.  : 0.3945
AUC      : 0.8009

L2 regularization

In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train2 = accuracy_score(y2_train, y_pred_train2)
prec2_train2 = precision_score(y2_train, y_pred_train2)
rec2_train2 = recall_score(y2_train, y_pred_train2)
auc2_train2 = roc_auc_score(y2_train, y_proba_train2[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train2))
print("Precision: {:.4f}".format(prec2_train2))
print("Recall.  : {:.4f}".format(rec2_train2))
print("AUC      : {:.4f}".format(auc2_train2))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test2 = accuracy_score(y2_test, y_pred_test2)
prec2_test2 = precision_score(y2_test, y_pred_test2)
rec2_test2 = recall_score(y2_test, y_pred_test2)
auc2_test2 = roc_auc_score(y2_test, y_proba_test2[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test2))
print("Precision: {:.4f}".format(prec2_test2))
print("Recall.  : {:.4f}".format(rec2_test2))
print("AUC      : {:.4f}".format(auc2_test2))
 -- train set -- 
Accuracy : 0.8185
Precision: 0.7171
Recall.  : 0.4220
AUC      : 0.8162

 -- test set -- 
Accuracy : 0.8124
Precision: 0.6999
Recall.  : 0.4004
AUC      : 0.8007

L1 c=0.01, More Data¶

Reload The Data¶

In [ ]:
#Reload the data to include data from customers who were dropped for missing
#data in categories that were not used in the original L1 c=0.01 model
df = pd.read_csv(data_path)

# preview data
df.info()
df.head
print(df)

# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
ID                    0
DemAffl            1085
DemAge             1508
DemCluster          674
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
PromClass             0
PromSpend             0
PromTime            281
TargetBuy             0
TargetAmt             0
dtype: int64

Preprocess The Data¶

In [ ]:
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
df['DemGender'].fillna('U', inplace=True)

#Drop customers with data missing in columns that will be used in the model
df.dropna(subset=['DemAffl', 'DemAge', 'DemCluster'], inplace=True)

print(df.isnull().sum()) #Show where there are still missing values
ID                   0
DemAffl              0
DemAge               0
DemCluster           0
DemClusterGroup      0
DemGender            0
DemReg             399
DemTVReg           399
PromClass            0
PromSpend            0
PromTime           225
TargetBuy            0
TargetAmt            0
dtype: int64
In [ ]:
#Drop the columns that were not used in the previous L1 C=0.01 model
#so we can use the data from customers with values missing in those columns
#
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemReg', 'DemTVReg', 'PromTime', 'ID', 'TargetAmt'], axis=1, inplace=True)
print(df.isnull().sum())
DemAffl            0
DemAge             0
DemCluster         0
DemClusterGroup    0
DemGender          0
PromClass          0
PromSpend          0
TargetBuy          0
dtype: int64
In [ ]:
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemGender']).columns))
df["DemClusterGroup"] = df["DemClusterGroup"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemClusterGroup']).columns))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(df['PromClass']).columns))
In [ ]:
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
In [ ]:
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X2 = df.drop('TargetBuy', axis=1)

y2 = df['TargetBuy']

#Split the data into training and test sets to be able to train and compare models
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=77)

X2_train.info()
y2_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13390 entries, 9855 to 17474
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DemAffl          13390 non-null  float64
 1   DemAge           13390 non-null  float64
 2   DemCluster       13390 non-null  float64
 3   DemClusterGroup  13390 non-null  int64  
 4   DemGender        13390 non-null  int64  
 5   PromClass        13390 non-null  int64  
 6   PromSpend        13390 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 836.9 KB
<class 'pandas.core.series.Series'>
Int64Index: 13390 entries, 9855 to 17474
Series name: TargetBuy
Non-Null Count  Dtype   
--------------  -----   
13390 non-null  category
dtypes: category(1)
memory usage: 117.8 KB

Create And Assess The Model¶

In [ ]:
#Create the L1 C=0.01 model with the new data
lr_l1_01_2 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
lr_l1_01_2.fit(X2_train, y2_train)
Out[ ]:
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
In [ ]:
#Evaluate the coefficients
df_coefficients01_2 = rpt_model_variables(lr_l1_01_2)
           feature  coefficient  abs_coefficient
4        DemGender      -0.7951           0.7951
0          DemAffl       0.2360           0.2360
1           DemAge      -0.0560           0.0560
5        PromClass      -0.0018           0.0018
2       DemCluster      -0.0005           0.0005
3  DemClusterGroup       0.0000           0.0000
6        PromSpend       0.0000           0.0000
In [ ]:
# plot variable importance

#Same function as plot_variable_imp except it prints a larger bar chart
# function to plot variable importance by creating a bar chart
# of absolute coefficients
def plot_variable_imp_larger(df_coef):
  # determine the variables the model is using and create df
  # of their absolute coefficients
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  # determine the variables the model is not using
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  # bar graph of the absolute coefficients that the model is using
  plt.figure(figsize=(40, 20)) #Create larger chart
  plt.title('Variable Importance', fontsize=18)
  plt.xlabel('Coefficient', fontsize=18)
  plt.ylabel('Feature', fontsize=18)
  plt.tick_params(axis='x', which='major', labelsize=18)
  plt.tick_params(axis='y', which='major', labelsize=25)
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  # print the variables the model is not using after the bar graph
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")
In [ ]:
# plot the variable importance for this model
plot_variable_imp_larger(df_coefficients01_2)
-- rejected --
 DemClusterGroup
 PromSpend
In [ ]:
#Make predictions on the training and test data to evaluate the model
y_pred_train01_2 = lr_l1_01_2.predict(X2_train)
y_pred_test01_2 = lr_l1_01_2.predict(X2_test)
y_proba_train01_2 = lr_l1_01_2.predict_proba(X2_train)
y_proba_test01_2 = lr_l1_01_2.predict_proba(X2_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train01_2 = accuracy_score(y2_train, y_pred_train01_2)
prec2_train01_2 = precision_score(y2_train, y_pred_train01_2)
rec2_train01_2 = recall_score(y2_train, y_pred_train01_2)
auc2_train01_2 = roc_auc_score(y2_train, y_proba_train01_2[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train01_2))
print("Precision: {:.4f}".format(prec2_train01_2))
print("Recall.  : {:.4f}".format(rec2_train01_2))
print("AUC      : {:.4f}".format(auc2_train01_2))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test01_2 = accuracy_score(y2_test, y_pred_test01_2)
prec2_test01_2 = precision_score(y2_test, y_pred_test01_2)
rec2_test01_2 = recall_score(y2_test, y_pred_test01_2)
auc2_test01_2 = roc_auc_score(y2_test, y_proba_test01_2[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test01_2))
print("Precision: {:.4f}".format(prec2_test01_2))
print("Recall.  : {:.4f}".format(rec2_test01_2))
print("AUC      : {:.4f}".format(auc2_test01_2))
 -- train set -- 
Accuracy : 0.8184
Precision: 0.7268
Recall.  : 0.4059
AUC      : 0.8094

 -- test set -- 
Accuracy : 0.8116
Precision: 0.7127
Recall.  : 0.4000
AUC      : 0.8094

L1 c=0.01, More Data 2¶

Reload The Data¶

In [ ]:
#Reload the data to include data from customers who were dropped for missing
#data in categories that were not used in the original L1 c=0.01 model and
#for missing data in DemClusterGroup or neighborhood group since it had
#negligible importance in the model
df = pd.read_csv(data_path)

# preview data
df.info()
df.head
print(df)

# total count of NaN values
print(df.isnull().sum())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
ID                    0
DemAffl            1085
DemAge             1508
DemCluster          674
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
PromClass             0
PromSpend             0
PromTime            281
TargetBuy             0
TargetAmt             0
dtype: int64

Preprocess The Data¶

In [ ]:
#Replace missing values in gender with unspecified which is a category that
#already exists in this data
df['DemGender'].fillna('U', inplace=True)

#Drop customers with data missing in columns that will be used in the model
df.dropna(subset=['DemAffl', 'DemAge'], inplace=True)

print(df.isnull().sum()) #Show where there are still missing values
ID                   0
DemAffl              0
DemAge               0
DemCluster         585
DemClusterGroup    585
DemGender            0
DemReg             411
DemTVReg           411
PromClass            0
PromSpend            0
PromTime           243
TargetBuy            0
TargetAmt            0
dtype: int64
In [ ]:
#Drop the columns that were not used in the original L1 C=0.01 model and
#DemClusterGroup or neighborhood group since it had negligible importance in
#both the original and previous L1 C=0.01 models so we can use the data from
#customers with values missing in those columns
#
#Drop columns that would be detrimental to machine learning
#ID is the unique identificator for each customer
#TargetAmt gives away if a customer brought organic products
#DemTVReg has too many categories
df.drop(['DemReg', 'DemTVReg', 'PromTime', 'ID', 'TargetAmt', 'DemCluster', 'DemClusterGroup'], axis=1, inplace=True)
print(df.isnull().sum())
DemAffl      0
DemAge       0
DemGender    0
PromClass    0
PromSpend    0
TargetBuy    0
dtype: int64
In [ ]:
#Make the categorical columns into dummies using the dummyize function so they can be used in a decision tree and logistic regression models
df["DemGender"] = df["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(df['DemGender']).columns))
df["PromClass"] = df["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(df['PromClass']).columns))
In [ ]:
#Make TargetBuy of type category so it can be used in training models
df['TargetBuy'] = df['TargetBuy'].astype('category')
In [ ]:
#Split the data on what the model is learning to predict, whether the customer
#brought organic products from the rest of the data
X2 = df.drop('TargetBuy', axis=1)

y2 = df['TargetBuy']

#Split the data into training and test sets to be able to train and compare models
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=77)

X2_train.info()
y2_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13799 entries, 9631 to 16966
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   DemAffl    13799 non-null  float64
 1   DemAge     13799 non-null  float64
 2   DemGender  13799 non-null  int64  
 3   PromClass  13799 non-null  int64  
 4   PromSpend  13799 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 646.8 KB
<class 'pandas.core.series.Series'>
Int64Index: 13799 entries, 9631 to 16966
Series name: TargetBuy
Non-Null Count  Dtype   
--------------  -----   
13799 non-null  category
dtypes: category(1)
memory usage: 121.4 KB

Create And Assess The Model¶

In [ ]:
#Create the L1 C=0.01 model with the new data
lr_l1_01_3 = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
lr_l1_01_3.fit(X2_train, y2_train)
Out[ ]:
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
In [ ]:
#Evaluate the coefficients
df_coefficients01_3 = rpt_model_variables(lr_l1_01_3)
     feature  coefficient  abs_coefficient
2  DemGender      -0.7904           0.7904
0    DemAffl       0.2337           0.2337
1     DemAge      -0.0547           0.0547
3  PromClass       0.0000           0.0000
4  PromSpend      -0.0000           0.0000
In [ ]:
# plot the variable importance for this model
plot_variable_imp(df_coefficients01_3)
-- rejected --
 PromClass
 PromSpend
In [ ]:
#Make predictions on the training and test data to evaluate the model
y_pred_train01_3 = lr_l1_01_3.predict(X2_train)
y_pred_test01_3 = lr_l1_01_3.predict(X2_test)
y_proba_train01_3 = lr_l1_01_3.predict_proba(X2_train)
y_proba_test01_3 = lr_l1_01_3.predict_proba(X2_test)
In [ ]:
# calculate the accuracy, precision, and recall scores for the training set
acc2_train01_3 = accuracy_score(y2_train, y_pred_train01_3)
prec2_train01_3 = precision_score(y2_train, y_pred_train01_3)
rec2_train01_3 = recall_score(y2_train, y_pred_train01_3)
auc2_train01_3 = roc_auc_score(y2_train, y_proba_train01_3[:,1])

# print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc2_train01_3))
print("Precision: {:.4f}".format(prec2_train01_3))
print("Recall.  : {:.4f}".format(rec2_train01_3))
print("AUC      : {:.4f}".format(auc2_train01_3))
print("")

# calculate the accuracy, precision, and recall scores for the testing set
acc2_test01_3 = accuracy_score(y2_test, y_pred_test01_3)
prec2_test01_3 = precision_score(y2_test, y_pred_test01_3)
rec2_test01_3 = recall_score(y2_test, y_pred_test01_3)
auc2_test01_3 = roc_auc_score(y2_test, y_proba_test01_3[:,1])

# print the scores for the testing set
print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc2_test01_3))
print("Precision: {:.4f}".format(prec2_test01_3))
print("Recall.  : {:.4f}".format(rec2_test01_3))
print("AUC      : {:.4f}".format(auc2_test01_3))
 -- train set -- 
Accuracy : 0.8139
Precision: 0.7240
Recall.  : 0.4118
AUC      : 0.8102

 -- test set -- 
Accuracy : 0.8220
Precision: 0.7077
Recall.  : 0.4168
AUC      : 0.8086

Score New Customers¶

Load The Data¶

In [ ]:
# load the data into a pandas dataframe

data_path = '/content/drive/My Drive/New_organics.csv'
dfnew = pd.read_csv(data_path)

# preview data
dfnew.info()
dfnew.head
print(dfnew)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               150 non-null    int64  
 1   DemAffl          150 non-null    int64  
 2   DemAge           150 non-null    int64  
 3   DemCluster       150 non-null    int64  
 4   DemClusterGroup  150 non-null    object 
 5   DemGender        150 non-null    object 
 6   DemReg           150 non-null    object 
 7   DemTVReg         150 non-null    object 
 8   PromClass        150 non-null    object 
 9   PromSpend        150 non-null    float64
 10  PromTime         150 non-null    int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 13.0+ KB
      ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender      DemReg  \
0      1        6      46          27               D         F  South East   
1      2       10      39          12               B         M    Midlands   
2      3        7      75          12               B         M    Midlands   
3      4        7      72          12               B         F    Midlands   
4      5        7      43          53               F         M       North   
..   ...      ...     ...         ...             ...       ...         ...   
145  146        3      59          12               B         F    Midlands   
146  147        8      69          16               C         M  South East   
147  158        7      27          52               F         F  South East   
148  149       10      63          14               B         M  South East   
149  150       18      41          35               D         F       North   

    DemTVReg PromClass  PromSpend  PromTime  
0     London       Tin       0.01         8  
1     Ulster       Tin       0.01         5  
2       East    Silver    4000.00         7  
3     Ulster    Silver    1250.00        23  
4     N West       Tin       0.01         5  
..       ...       ...        ...       ...  
145     East      Gold   12000.00         4  
146   London      Gold   14400.00         8  
147   London       Tin       0.01         4  
148   London    Silver    3000.00         7  
149   N East       Tin       0.01         5  

[150 rows x 11 columns]

Preprocess The Data¶

In [ ]:
#Drop columns that were not used in the modified tuned decision tree so it can
#score this new data
dfnew.drop(['DemCluster', 'DemClusterGroup', 'DemReg', 'DemTVReg', 'PromTime', 'ID'], axis=1, inplace=True)
print(dfnew.isnull().sum())
DemAffl      0
DemAge       0
DemGender    0
PromClass    0
PromSpend    0
dtype: int64
In [ ]:
#Make the categorical columns into dummies using the dummyize function so they
#can be scored by the modified tuned decision tree
dfnew["DemGender"] = dfnew["DemGender"].apply(lambda x: dummyize(x, pd.get_dummies(dfnew['DemGender']).columns))
dfnew["PromClass"] = dfnew["PromClass"].apply(lambda x: dummyize(x, pd.get_dummies(dfnew['PromClass']).columns))
In [ ]:
# make predictions on the new data
Scored_class = dt_tuned_mod_more_data.predict(dfnew)
Scored_probs = dt_tuned_mod_more_data.predict_proba(dfnew)

# Make the predictions and probabilities into a dataframe
dfnew['pred_Buy'] = Scored_class
dfnew['prod_prob_buy']=Scored_probs[:,1]

print(dfnew) # Print the new dataframe
     DemAffl  DemAge  DemGender  PromClass  PromSpend  pred_Buy  prod_prob_buy
0          6      46          0          3       0.01         0       0.144857
1         10      39          1          3       0.01         0       0.377709
2          7      75          1          2    4000.00         0       0.052396
3          7      72          0          2    1250.00         0       0.144857
4          7      43          1          3       0.01         0       0.189076
..       ...     ...        ...        ...        ...       ...            ...
145        3      59          0          0   12000.00         0       0.069240
146        8      69          1          0   14400.00         0       0.052396
147        7      27          0          3       0.01         1       0.528881
148       10      63          1          2    3000.00         0       0.127919
149       18      41          0          3       0.01         1       0.809524

[150 rows x 7 columns]
In [ ]:
# Make the new dataframe with the predictions and probabilities into an Excel file
scored_file = 'NewOrganicCustomerScores.xlsx'
dfnew.to_excel(scored_file)

Exploratory Analysis¶

In [66]:
#Reload the data into a dataframe called df_analysis to do analysis with the
#original data without any changes made for machine learning
data_path = '/content/drive/MyDrive/organics.csv'

df_analysis = pd.read_csv(data_path)

# preview data
df_analysis.info()
df_analysis.head
print(df_analysis)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22223 entries, 0 to 22222
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               22223 non-null  int64  
 1   DemAffl          21138 non-null  float64
 2   DemAge           20715 non-null  float64
 3   DemCluster       21549 non-null  float64
 4   DemClusterGroup  21549 non-null  object 
 5   DemGender        19711 non-null  object 
 6   DemReg           21758 non-null  object 
 7   DemTVReg         21758 non-null  object 
 8   PromClass        22223 non-null  object 
 9   PromSpend        22223 non-null  float64
 10  PromTime         21942 non-null  float64
 11  TargetBuy        22223 non-null  int64  
 12  TargetAmt        22223 non-null  int64  
dtypes: float64(5), int64(3), object(5)
memory usage: 2.2+ MB
             ID  DemAffl  DemAge  DemCluster DemClusterGroup DemGender  \
0           140     10.0    76.0        16.0               C         U   
1           620      4.0    49.0        35.0               D         U   
2           868      5.0    70.0        27.0               D         F   
3          1120     10.0    65.0        51.0               F         M   
4          2313     11.0    68.0         4.0               A         F   
...         ...      ...     ...         ...             ...       ...   
22218  52834058     13.0    65.0         9.0               B         F   
22219  52834376     15.0    73.0        34.0               D         U   
22220  52837057      9.0    70.0        15.0               B         F   
22221  52838096     11.0    66.0         8.0               B         F   
22222  52856469      3.0    75.0        53.0               F       NaN   

           DemReg      DemTVReg PromClass  PromSpend  PromTime  TargetBuy  \
0        Midlands  Wales & West      Gold   16000.00       4.0          0   
1        Midlands  Wales & West      Gold    6000.00       5.0          0   
2        Midlands  Wales & West    Silver       0.02       8.0          1   
3        Midlands      Midlands       Tin       0.01       7.0          1   
4        Midlands      Midlands       Tin       0.01       8.0          0   
...           ...           ...       ...        ...       ...        ...   
22218  South East        London    Silver    1500.00       5.0          0   
22219  South East    S & S East      Gold    6053.06      12.0          0   
22220       North     Yorkshire      Gold    6000.00       5.0          0   
22221       North        N West    Silver    5000.00       5.0          0   
22222  South West        S West      Gold    6000.00       2.0          0   

       TargetAmt  
0              0  
1              0  
2              1  
3              1  
4              0  
...          ...  
22218          0  
22219          0  
22220          0  
22221          0  
22222          0  

[22223 rows x 13 columns]
In [67]:
#Get the average amount spent, time spent, age, and affluence for each loyalty status
df_analysis.groupby('PromClass')[['PromSpend', 'PromTime', 'DemAge', 'DemAffl']].mean()
Out[67]:
PromSpend PromTime DemAge DemAffl
PromClass
Gold 8795.965194 6.418389 59.553901 8.518170
Platinum 30399.233869 6.974668 62.288973 8.550186
Silver 1992.261697 6.770860 53.652875 8.716769
Tin 0.010000 6.379727 47.222038 8.916815
In [68]:
#Get the average amount spent, time spent, age, and affluence for each loyalty status
#for customers 40 years old and younger
df_analysis[df_analysis['DemAge'] <= 40].groupby('PromClass')[['PromSpend', 'PromTime', 'DemAffl']].mean()
Out[68]:
PromSpend PromTime DemAffl
PromClass
Gold 8157.519794 4.615925 9.995192
Platinum 35729.350294 5.312500 9.343750
Silver 1502.637701 5.120321 9.754848
Tin 0.010000 5.862756 9.815940
In [69]:
#Get the minimum amount spent, time spent, age, and affluence for each loyalty status
df_analysis.groupby('PromClass')[['PromSpend', 'PromTime', 'DemAffl']].min()
Out[69]:
PromSpend PromTime DemAffl
PromClass
Gold 5000.01 0.0 0.0
Platinum 20000.01 0.0 2.0
Silver 0.02 0.0 0.0
Tin 0.01 0.0 0.0
In [70]:
#Get the maximum amount spent, time spent, age, and affluence for each loyalty status
df_analysis.groupby('PromClass')[['PromSpend', 'PromTime', 'DemAffl']].max()
Out[70]:
PromSpend PromTime DemAffl
PromClass
Gold 20000.00 38.0 29.0
Platinum 296313.85 32.0 21.0
Silver 5000.00 39.0 34.0
Tin 0.01 35.0 28.0
In [71]:
#Determine how many customers are in each loyalty status
df_analysis.groupby('PromClass')['ID'].count()
Out[71]:
PromClass
Gold        6324
Platinum     840
Silver      8572
Tin         6487
Name: ID, dtype: int64
In [72]:
#Get the percentage of customers who did and did not buy organic products for
#each loyalty status
promclass_buy = df_analysis.groupby(["PromClass", "TargetBuy"])["ID"].count() / df_analysis.groupby(["PromClass"])["ID"].count()
promclass_buy
Out[72]:
PromClass  TargetBuy
Gold       0            0.804554
           1            0.195446
Platinum   0            0.847619
           1            0.152381
Silver     0            0.753616
           1            0.246384
Tin        0            0.687221
           1            0.312779
Name: ID, dtype: float64
In [73]:
#Only get the percentage of customers who did buy organic products for
#each loyalty status, keep column names accurate and make into a dataframe
promclass_buy = promclass_buy.reset_index()
promclass_buy = promclass_buy[promclass_buy['TargetBuy'] == 1].drop('TargetBuy', axis=1)
promclass_buy = promclass_buy.rename(columns={'ID': 'Percentage'})
promclass_buy
Out[73]:
PromClass Percentage
1 Gold 0.195446
3 Platinum 0.152381
5 Silver 0.246384
7 Tin 0.312779
In [74]:
#Plot the percentage of customers who did buy organic products for each loyalty status
plt.figure(figsize=(20, 10)) #Have a larger size for presentations
plot = sns.barplot(data=promclass_buy,  #Color the platinum and tin bars and make the other bars grey to point out the difference between the highest and lowest loyalty statuses. Order the bars from lowest to highest percentage
              x='PromClass', y='Percentage', palette=['peru' if (x == 'Platinum' or x == 'Tin') else 'lightgrey' for x in promclass_buy.sort_values('Percentage')['PromClass']], order=promclass_buy.sort_values('Percentage')['PromClass'])
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("Percentage of Customers Buying Organic Products by Loyalty Class", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought organic
# products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
 s = '{:.0%}'.format(height), # data label format of the percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [75]:
#Get the percentage of customers who brought two or more organic products in each loyalty status
promclass_buy2 = df_analysis[df_analysis["TargetAmt"] >= 2].groupby(["PromClass"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["PromClass"])["ID"].count()
promclass_buy2
Out[75]:
PromClass
Gold        0.135922
Platinum    0.062500
Silver      0.158144
Tin         0.182356
Name: ID, dtype: float64
In [76]:
#Make into dataframe and keep column names accurate
promclass_buy2 = promclass_buy2.reset_index()
promclass_buy2 = promclass_buy2.rename(columns={'ID': 'Percentage'})
promclass_buy2
Out[76]:
PromClass Percentage
0 Gold 0.135922
1 Platinum 0.062500
2 Silver 0.158144
3 Tin 0.182356
In [77]:
#Plot the percentage of customers who brought two or more organic products for each loyalty status
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=promclass_buy2,
              x='PromClass', y='Percentage', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the column chart a title
plt.title("Percentage of Customers Buying Two or More Organic Products by Loyalty Class", fontsize = 20)
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought two or
# more organic products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
 s = '{:.0%}'.format(height), # data label format of the percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [78]:
#Get the percentage of customers who brought three organic products in each loyalty status
promclass_buy3 = df_analysis[df_analysis["TargetAmt"] >= 3].groupby(["PromClass"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["PromClass"])["ID"].count()
promclass_buy3
Out[78]:
PromClass
Gold        0.018608
Platinum    0.007812
Silver      0.033144
Tin         0.034993
Name: ID, dtype: float64
In [79]:
#Make into dataframe and keep column names accurate
promclass_buy3 = promclass_buy3.reset_index()
promclass_buy3 = promclass_buy3.rename(columns={'ID': 'Percentage'})
promclass_buy3
Out[79]:
PromClass Percentage
0 Gold 0.018608
1 Platinum 0.007812
2 Silver 0.033144
3 Tin 0.034993
In [80]:
#Plot the percentage of customers who brought three organic products for each loyalty status
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=promclass_buy3,
              x='PromClass', y='Percentage', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("Percentage of Customers Buying Three Organic Products by Loyalty Class", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
 s = '{:.0%}'.format(height), # data label format of the percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [81]:
#Get the average age of customers for each loyalty status
promclass_age = df_analysis.groupby('PromClass')['DemAge'].mean().reset_index()
promclass_age
Out[81]:
PromClass DemAge
0 Gold 59.553901
1 Platinum 62.288973
2 Silver 53.652875
3 Tin 47.222038
In [82]:
#Plot the average age of customers for each loyalty status
plt.figure(figsize=(20, 10)) #Have a larger size for presentations
plot = sns.barplot(data=promclass_age,  #Color the platinum and tin bars and make the other bars grey to point out the difference between the highest and lowest loyalty statuses. Order the bars from lowest to highest average age
              x='PromClass', y='DemAge', palette=['peru' if (x == 'Platinum' or x == 'Tin') else 'lightgrey' for x in promclass_age.sort_values('DemAge')['PromClass']], order=promclass_age.sort_values('DemAge')['PromClass'])
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the column chart a title
plt.title("Average Age of Customers by Loyalty Class", fontsize = 20)
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the average age of the customers in that loyalty status
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+1, # y-coordinate position of data label, padded 1 above bar
 s = '{:.0f}'.format(height), # data label format of the average age
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [83]:
#Get the percentage of customers who did and did not buy organic products by gender
gender_buy = df_analysis.groupby(["DemGender", "TargetBuy"])["ID"].count() / df_analysis.groupby(["DemGender"])["ID"].count()
gender_buy
Out[83]:
DemGender  TargetBuy
F          0            0.653881
           1            0.346119
M          0            0.833878
           1            0.166122
U          0            0.939897
           1            0.060103
Name: ID, dtype: float64
In [84]:
#Only get the percentage of customers who did buy organic products by gender,
#keep the column names accurate and make into a dataframe
gender_buy = gender_buy.reset_index()
gender_buy = gender_buy[gender_buy['TargetBuy'] == 1].drop('TargetBuy', axis=1)
gender_buy = gender_buy.replace({'F': 'Women', 'M': 'Men', 'U': 'Unspecified'}) #Replace one letter with words for genders
gender_buy = gender_buy.rename(columns={'ID': 'Percentage'})
gender_buy
Out[84]:
DemGender Percentage
1 Women 0.346119
3 Men 0.166122
5 Unspecified 0.060103
In [85]:
#Plot the percentage of customers who did buy organic products by gender
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=gender_buy,          #Color the bar for women blue and make the others grey to point out how women are more likely to buy organic products
              x='DemGender', y='Percentage', palette=['blue' if x == 'Women' else 'lightgrey' for x in gender_buy['DemGender']])
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("Percentage of Customers Buying Organic Products by Gender", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought organic
# products for that gender
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
 s = '{:.0%}'.format(height), # data label format of the percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [86]:
#Get the percentage of customers who brought two or more organic products by gender
gender_buy2 = df_analysis[df_analysis["TargetAmt"] >= 2].groupby(["DemGender"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["DemGender"])["ID"].count()
gender_buy2
Out[86]:
DemGender
F    0.158383
M    0.150104
U    0.133333
Name: ID, dtype: float64
In [87]:
#Make into dataframe and keep column names accurate
gender_buy2 = gender_buy2.reset_index()
gender_buy2 = gender_buy2.replace({'F': 'Women', 'M': 'Men', 'U': 'Unspecified'}) #Replace one letter with words for genders
gender_buy2 = gender_buy2.rename(columns={'ID': 'Percentage'})
gender_buy2
Out[87]:
DemGender Percentage
0 Women 0.158383
1 Men 0.150104
2 Unspecified 0.133333
In [88]:
#Plot the percentage of customers who brought two or more organic products by gender
plt.figure(figsize=(20, 5)) #Make half length to take up half of slide
plot = sns.barplot(data=gender_buy2,
              x='DemGender', y='Percentage', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("2 Or More Products", fontsize=16)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought two or more organic
# products for that gender
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
 s = '{:.0%}'.format(height), # data label format of the percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [89]:
#Get the percentage of customers who brought three organic products by gender
gender_buy3 = df_analysis[df_analysis["TargetAmt"] >= 3].groupby(["DemGender"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["DemGender"])["ID"].count()
gender_buy3
Out[89]:
DemGender
F    0.030440
M    0.027950
U    0.019048
Name: ID, dtype: float64
In [90]:
#Make into dataframe and keep column names accurate
gender_buy3 = gender_buy3.reset_index()
gender_buy3 = gender_buy3.replace({'F': 'Women', 'M': 'Men', 'U': 'Unspecified'}) #Replace one letter with words for genders
gender_buy3 = gender_buy3.rename(columns={'ID': 'Percentage'})
gender_buy3
Out[90]:
DemGender Percentage
0 Women 0.030440
1 Men 0.027950
2 Unspecified 0.019048
In [91]:
#Plot the percentage of customers who brought three organic products by gender
plt.figure(figsize=(20, 5)) #Make half length to take up half of slide
plot = sns.barplot(data=gender_buy3,
              x='DemGender', y='Percentage', color='lightblue')
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the column chart a title
plt.title("3 Products", fontsize=16)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the percentage of customers who brought three organic
# products for that gender
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+0.02, # y-coordinate position of data label, padded 0.02 above bar
 s = '{:.0%}'.format(height), # data label format of the percentage
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [92]:
#Get the percentage of customers who did and did not buy organic products by age
age_buy = df_analysis.groupby(["DemAge", "TargetBuy"])["ID"].count() / df_analysis.groupby(["DemAge"])["ID"].count()
age_buy
Out[92]:
DemAge  TargetBuy
18.0    0            1.000000
19.0    0            1.000000
20.0    0            0.300000
        1            0.700000
21.0    0            0.500000
                       ...   
77.0    1            0.192593
78.0    0            0.824390
        1            0.175610
79.0    0            0.825503
        1            0.174497
Name: ID, Length: 122, dtype: float64
In [93]:
#Only get the percentage of customers who did buy organic products by age,
#keep the column names accurate and make into a dataframe
age_buy = age_buy.reset_index()
age_buy = age_buy[age_buy['TargetBuy'] == 1].drop('TargetBuy', axis=1)
age_buy = age_buy.rename(columns={'ID': 'Percentage'})
age_buy
Out[93]:
DemAge Percentage
3 20.0 0.700000
5 21.0 0.500000
7 22.0 0.611111
9 23.0 0.652174
11 24.0 0.714286
13 25.0 0.543478
15 26.0 0.638298
17 27.0 0.671053
19 28.0 0.509615
21 29.0 0.626016
23 30.0 0.559524
25 31.0 0.530806
27 32.0 0.645570
29 33.0 0.542662
31 34.0 0.621053
33 35.0 0.594828
35 36.0 0.593373
37 37.0 0.615764
39 38.0 0.580838
41 39.0 0.626087
43 40.0 0.382199
45 41.0 0.396947
47 42.0 0.410526
49 43.0 0.403226
51 44.0 0.408889
53 45.0 0.042316
55 46.0 0.171367
57 47.0 0.156604
59 48.0 0.138258
61 49.0 0.144250
63 50.0 0.131757
65 51.0 0.155738
67 52.0 0.153704
69 53.0 0.135940
71 54.0 0.155340
73 55.0 0.164356
75 56.0 0.162946
77 57.0 0.132479
79 58.0 0.153374
81 59.0 0.150980
83 60.0 0.131429
85 61.0 0.174797
87 62.0 0.150327
89 63.0 0.135560
91 64.0 0.156522
93 65.0 0.137860
95 66.0 0.133047
97 67.0 0.124717
99 68.0 0.190931
101 69.0 0.154450
103 70.0 0.148541
105 71.0 0.141243
107 72.0 0.133127
109 73.0 0.142857
111 74.0 0.141538
113 75.0 0.169811
115 76.0 0.135762
117 77.0 0.192593
119 78.0 0.175610
121 79.0 0.174497
In [94]:
#Plot the percentage of customers who did buy organic products by age
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=age_buy,
              x='DemAge', y='Percentage', linewidth=3)
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
#Give the line graph a title
plt.title("Age of Customers and Percentage Buying Organic Products", fontsize = 20)
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
plt.xlim(19,80) #Have the ages on the x-axis be from 19 to 80 years old
sns.despine() #Do not have a top and right side border
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [95]:
#Get the percentage of customers who brought two or more organic products by age
age_buy2 = df_analysis[df_analysis["TargetAmt"] >= 2].groupby(["DemAge"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["DemAge"])["ID"].count()
age_buy2
Out[95]:
DemAge
20.0         NaN
21.0    0.500000
22.0    0.272727
23.0    0.133333
24.0    0.133333
25.0    0.200000
26.0    0.133333
27.0    0.098039
28.0    0.094340
29.0    0.129870
30.0    0.191489
31.0    0.178571
32.0    0.143791
33.0    0.213836
34.0    0.209040
35.0    0.188406
36.0    0.203046
37.0    0.172000
38.0    0.159794
39.0    0.143519
40.0    0.068493
41.0    0.173077
42.0    0.089744
43.0    0.091429
44.0    0.130435
45.0    1.000000
46.0    0.303797
47.0    0.228916
48.0    0.356164
49.0    0.310811
50.0    0.346154
51.0    0.378947
52.0    0.289157
53.0    0.397260
54.0    0.287500
55.0    0.253012
56.0    0.260274
57.0    0.338710
58.0    0.320000
59.0    0.363636
60.0         NaN
61.0         NaN
62.0         NaN
63.0         NaN
64.0         NaN
65.0         NaN
66.0         NaN
67.0         NaN
68.0         NaN
69.0         NaN
70.0         NaN
71.0         NaN
72.0         NaN
73.0         NaN
74.0         NaN
75.0         NaN
76.0         NaN
77.0         NaN
78.0         NaN
79.0         NaN
Name: ID, dtype: float64
In [96]:
#Make into dataframe and keep column names accurate
age_buy2 = age_buy2.reset_index()
age_buy2 = age_buy2.rename(columns={'ID': 'Percentage'})
age_buy2['Percentage'].fillna(value=0, inplace=True) #Replace null with 0%
age_buy2
Out[96]:
DemAge Percentage
0 20.0 0.000000
1 21.0 0.500000
2 22.0 0.272727
3 23.0 0.133333
4 24.0 0.133333
5 25.0 0.200000
6 26.0 0.133333
7 27.0 0.098039
8 28.0 0.094340
9 29.0 0.129870
10 30.0 0.191489
11 31.0 0.178571
12 32.0 0.143791
13 33.0 0.213836
14 34.0 0.209040
15 35.0 0.188406
16 36.0 0.203046
17 37.0 0.172000
18 38.0 0.159794
19 39.0 0.143519
20 40.0 0.068493
21 41.0 0.173077
22 42.0 0.089744
23 43.0 0.091429
24 44.0 0.130435
25 45.0 1.000000
26 46.0 0.303797
27 47.0 0.228916
28 48.0 0.356164
29 49.0 0.310811
30 50.0 0.346154
31 51.0 0.378947
32 52.0 0.289157
33 53.0 0.397260
34 54.0 0.287500
35 55.0 0.253012
36 56.0 0.260274
37 57.0 0.338710
38 58.0 0.320000
39 59.0 0.363636
40 60.0 0.000000
41 61.0 0.000000
42 62.0 0.000000
43 63.0 0.000000
44 64.0 0.000000
45 65.0 0.000000
46 66.0 0.000000
47 67.0 0.000000
48 68.0 0.000000
49 69.0 0.000000
50 70.0 0.000000
51 71.0 0.000000
52 72.0 0.000000
53 73.0 0.000000
54 74.0 0.000000
55 75.0 0.000000
56 76.0 0.000000
57 77.0 0.000000
58 78.0 0.000000
59 79.0 0.000000
In [97]:
#Plot the percentage of customers who brought two or more organic products by age
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=age_buy2,
              x='DemAge', y='Percentage', linewidth=3)
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
plt.xlim(19,80) #Have the ages on the x-axis be from 19 to 80 years old
#Give the line graph a title
plt.title("Age of Customers Buying Organic Products and Percentage Buying Two Or More Organic Products", fontsize = 20)
sns.despine() #Do not have a top and right side border
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [98]:
#Get the percentage of customers who brought three organic products by age
age_buy3 = df_analysis[df_analysis["TargetAmt"] >= 3].groupby(["DemAge"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["DemAge"])["ID"].count()
age_buy3
Out[98]:
DemAge
20.0         NaN
21.0         NaN
22.0    0.181818
23.0    0.066667
24.0    0.133333
25.0    0.080000
26.0    0.066667
27.0    0.019608
28.0    0.056604
29.0    0.051948
30.0    0.063830
31.0    0.053571
32.0    0.078431
33.0    0.106918
34.0    0.067797
35.0    0.082126
36.0    0.091371
37.0    0.080000
38.0    0.082474
39.0    0.055556
40.0         NaN
41.0         NaN
42.0         NaN
43.0         NaN
44.0         NaN
45.0         NaN
46.0         NaN
47.0         NaN
48.0         NaN
49.0         NaN
50.0         NaN
51.0         NaN
52.0         NaN
53.0         NaN
54.0         NaN
55.0         NaN
56.0         NaN
57.0         NaN
58.0         NaN
59.0         NaN
60.0         NaN
61.0         NaN
62.0         NaN
63.0         NaN
64.0         NaN
65.0         NaN
66.0         NaN
67.0         NaN
68.0         NaN
69.0         NaN
70.0         NaN
71.0         NaN
72.0         NaN
73.0         NaN
74.0         NaN
75.0         NaN
76.0         NaN
77.0         NaN
78.0         NaN
79.0         NaN
Name: ID, dtype: float64
In [99]:
#Make into dataframe and keep column names accurate
age_buy3 = age_buy3.reset_index()
age_buy3 = age_buy3.rename(columns={'ID': 'Percentage'})
age_buy3['Percentage'].fillna(value=0, inplace=True) #Replace null with 0%
age_buy3
Out[99]:
DemAge Percentage
0 20.0 0.000000
1 21.0 0.000000
2 22.0 0.181818
3 23.0 0.066667
4 24.0 0.133333
5 25.0 0.080000
6 26.0 0.066667
7 27.0 0.019608
8 28.0 0.056604
9 29.0 0.051948
10 30.0 0.063830
11 31.0 0.053571
12 32.0 0.078431
13 33.0 0.106918
14 34.0 0.067797
15 35.0 0.082126
16 36.0 0.091371
17 37.0 0.080000
18 38.0 0.082474
19 39.0 0.055556
20 40.0 0.000000
21 41.0 0.000000
22 42.0 0.000000
23 43.0 0.000000
24 44.0 0.000000
25 45.0 0.000000
26 46.0 0.000000
27 47.0 0.000000
28 48.0 0.000000
29 49.0 0.000000
30 50.0 0.000000
31 51.0 0.000000
32 52.0 0.000000
33 53.0 0.000000
34 54.0 0.000000
35 55.0 0.000000
36 56.0 0.000000
37 57.0 0.000000
38 58.0 0.000000
39 59.0 0.000000
40 60.0 0.000000
41 61.0 0.000000
42 62.0 0.000000
43 63.0 0.000000
44 64.0 0.000000
45 65.0 0.000000
46 66.0 0.000000
47 67.0 0.000000
48 68.0 0.000000
49 69.0 0.000000
50 70.0 0.000000
51 71.0 0.000000
52 72.0 0.000000
53 73.0 0.000000
54 74.0 0.000000
55 75.0 0.000000
56 76.0 0.000000
57 77.0 0.000000
58 78.0 0.000000
59 79.0 0.000000
In [100]:
#Plot the percentage of customers who brought three organic products by age
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=age_buy3,
              x='DemAge', y='Percentage', linewidth=3)
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
plt.xlim(19,80) #Have the ages on the x-axis be from 19 to 80 years old
#Give the line graph a title
plt.title("Age of Customers Buying Organic Products and Percentage Buying Three Organic Products", fontsize = 20)
sns.despine() #Do not have a top and right side border
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [101]:
#Get the percentage of customers who did and did not buy organic products by affluence grade
affl_buy = df_analysis.groupby(["DemAffl", "TargetBuy"])["ID"].count() / df_analysis.groupby(["DemAffl"])["ID"].count()
affl_buy
Out[101]:
DemAffl  TargetBuy
0.0      0            1.000000
1.0      0            1.000000
2.0      0            0.950000
         1            0.050000
3.0      0            0.915572
         1            0.084428
4.0      0            0.924720
         1            0.075280
5.0      0            0.896510
         1            0.103490
6.0      0            0.864369
         1            0.135631
7.0      0            0.846600
         1            0.153400
8.0      0            0.810934
         1            0.189066
9.0      0            0.765867
         1            0.234133
10.0     0            0.730203
         1            0.269797
11.0     0            0.681981
         1            0.318019
12.0     0            0.626914
         1            0.373086
13.0     0            0.546729
         1            0.453271
14.0     0            0.474747
         1            0.525253
15.0     0            0.411911
         1            0.588089
16.0     0            0.318367
         1            0.681633
17.0     0            0.275862
         1            0.724138
18.0     0            0.213483
         1            0.786517
19.0     0            0.112676
         1            0.887324
20.0     0            0.016949
         1            0.983051
21.0     1            1.000000
22.0     1            1.000000
23.0     1            1.000000
24.0     1            1.000000
25.0     1            1.000000
26.0     1            1.000000
27.0     1            1.000000
28.0     1            1.000000
29.0     1            1.000000
30.0     1            1.000000
31.0     1            1.000000
34.0     1            1.000000
Name: ID, dtype: float64
In [102]:
#Only get the percentage of customers who did buy organic products by affluence grade,
#keep the column names accurate and make into a dataframe
affl_buy = affl_buy.reset_index()
affl_buy = affl_buy[affl_buy['TargetBuy'] == 1].drop('TargetBuy', axis=1)
affl_buy = affl_buy.rename(columns={'ID': 'Percentage'})
affl_buy
Out[102]:
DemAffl Percentage
3 2.0 0.050000
5 3.0 0.084428
7 4.0 0.075280
9 5.0 0.103490
11 6.0 0.135631
13 7.0 0.153400
15 8.0 0.189066
17 9.0 0.234133
19 10.0 0.269797
21 11.0 0.318019
23 12.0 0.373086
25 13.0 0.453271
27 14.0 0.525253
29 15.0 0.588089
31 16.0 0.681633
33 17.0 0.724138
35 18.0 0.786517
37 19.0 0.887324
39 20.0 0.983051
40 21.0 1.000000
41 22.0 1.000000
42 23.0 1.000000
43 24.0 1.000000
44 25.0 1.000000
45 26.0 1.000000
46 27.0 1.000000
47 28.0 1.000000
48 29.0 1.000000
49 30.0 1.000000
50 31.0 1.000000
51 34.0 1.000000
In [103]:
#Plot the percentage of customers who did buy organic products by affluence grade
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_buy,
              x='DemAffl', y='Percentage', linewidth=3)
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the line graph a title
plt.title("Affluence Grade of Customers and Percentage Buying Organic Products", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [104]:
#Get the percentage of customers who brought two or more organic products by affluence grade
affl_buy2 = df_analysis[df_analysis["TargetAmt"] >= 2].groupby(["DemAffl"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["DemAffl"])["ID"].count()
affl_buy2
Out[104]:
DemAffl
2.0          NaN
3.0          NaN
4.0          NaN
5.0          NaN
6.0     0.009901
7.0     0.017632
8.0     0.018072
9.0     0.049587
10.0    0.071672
11.0    0.108818
12.0    0.125270
13.0    0.208763
14.0    0.259615
15.0    0.337553
16.0    0.335329
17.0    0.600000
18.0    0.642857
19.0    0.825397
20.0    0.862069
21.0    0.861111
22.0    0.920000
23.0    1.000000
24.0    1.000000
25.0    1.000000
26.0    1.000000
27.0    1.000000
28.0    1.000000
29.0    1.000000
30.0    1.000000
31.0    1.000000
34.0    1.000000
Name: ID, dtype: float64
In [105]:
#Make into dataframe and keep column names accurate
affl_buy2 = affl_buy2.reset_index()
affl_buy2 = affl_buy2.rename(columns={'ID': 'Percentage'}) #Replace null with 0%
affl_buy2['Percentage'].fillna(value=0, inplace=True)
affl_buy2
Out[105]:
DemAffl Percentage
0 2.0 0.000000
1 3.0 0.000000
2 4.0 0.000000
3 5.0 0.000000
4 6.0 0.009901
5 7.0 0.017632
6 8.0 0.018072
7 9.0 0.049587
8 10.0 0.071672
9 11.0 0.108818
10 12.0 0.125270
11 13.0 0.208763
12 14.0 0.259615
13 15.0 0.337553
14 16.0 0.335329
15 17.0 0.600000
16 18.0 0.642857
17 19.0 0.825397
18 20.0 0.862069
19 21.0 0.861111
20 22.0 0.920000
21 23.0 1.000000
22 24.0 1.000000
23 25.0 1.000000
24 26.0 1.000000
25 27.0 1.000000
26 28.0 1.000000
27 29.0 1.000000
28 30.0 1.000000
29 31.0 1.000000
30 34.0 1.000000
In [106]:
#Plot the percentage of customers who brought two or more organic products by affluence grade
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_buy2,
              x='DemAffl', y='Percentage', linewidth=3)
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the line graph a title
plt.title("Affluence Grade of Customers Buying Organic Products and Percentage Buying Two or More Organic Products", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [107]:
#Get the percentage of customers who brought three organic products by age
affl_buy3 = df_analysis[df_analysis["TargetAmt"] >= 3].groupby(["DemAffl"])["ID"].count() / df_analysis[df_analysis["TargetBuy"] == 1].groupby(["DemAffl"])["ID"].count()
affl_buy3
Out[107]:
DemAffl
2.0          NaN
3.0          NaN
4.0          NaN
5.0          NaN
6.0          NaN
7.0          NaN
8.0          NaN
9.0          NaN
10.0         NaN
11.0    0.003752
12.0    0.010799
13.0    0.012887
14.0    0.025641
15.0    0.046414
16.0    0.029940
17.0    0.085714
18.0    0.128571
19.0    0.269841
20.0    0.327586
21.0    0.333333
22.0    0.440000
23.0    0.411765
24.0    0.266667
25.0    1.000000
26.0    0.500000
27.0    1.000000
28.0    1.000000
29.0    1.000000
30.0    1.000000
31.0    1.000000
34.0    1.000000
Name: ID, dtype: float64
In [108]:
#Make into dataframe and keep column names accurate
affl_buy3 = affl_buy3.reset_index()
affl_buy3 = affl_buy3.rename(columns={'ID': 'Percentage'})
affl_buy3['Percentage'].fillna(value=0, inplace=True) #Replace null with 0%
affl_buy3
Out[108]:
DemAffl Percentage
0 2.0 0.000000
1 3.0 0.000000
2 4.0 0.000000
3 5.0 0.000000
4 6.0 0.000000
5 7.0 0.000000
6 8.0 0.000000
7 9.0 0.000000
8 10.0 0.000000
9 11.0 0.003752
10 12.0 0.010799
11 13.0 0.012887
12 14.0 0.025641
13 15.0 0.046414
14 16.0 0.029940
15 17.0 0.085714
16 18.0 0.128571
17 19.0 0.269841
18 20.0 0.327586
19 21.0 0.333333
20 22.0 0.440000
21 23.0 0.411765
22 24.0 0.266667
23 25.0 1.000000
24 26.0 0.500000
25 27.0 1.000000
26 28.0 1.000000
27 29.0 1.000000
28 30.0 1.000000
29 31.0 1.000000
30 34.0 1.000000
In [109]:
#Plot the percentage of customers who brought three organic products by affluence grade
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_buy3,
              x='DemAffl', y='Percentage', linewidth=3)
plot.yaxis.set_major_formatter(PercentFormatter(1)) #Make y-axis of percentages
plt.ylim(0,1) #Have the percentages on the y-axis be from 0% to 100%
#Give the line graph a title
plt.title("Affluence Grade of Customers Buying Organic Products and Percentage Buying Three Organic Products", fontsize = 20)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [110]:
#Get average amount spent by affluence grade
affl_spend = df_analysis.groupby('DemAffl')['PromSpend'].mean().reset_index()
affl_spend
Out[110]:
DemAffl PromSpend
0 0.0 4217.228333
1 1.0 4106.715833
2 2.0 4131.813700
3 3.0 4833.379231
4 4.0 4163.785941
5 5.0 4719.320463
6 6.0 4487.887167
7 7.0 4598.021167
8 8.0 4268.491409
9 9.0 4437.048189
10 10.0 4774.821013
11 11.0 4533.221068
12 12.0 4367.005302
13 13.0 4085.777278
14 14.0 4256.772559
15 15.0 5351.760372
16 16.0 3308.645592
17 17.0 2286.102138
18 18.0 3223.334157
19 19.0 3488.155070
20 20.0 3034.970508
21 21.0 3324.370833
22 22.0 2034.605200
23 23.0 3500.861176
24 24.0 4068.178667
25 25.0 2167.387692
26 26.0 1258.771667
27 27.0 879.598000
28 28.0 0.010000
29 29.0 6166.666667
30 30.0 1453.333333
31 31.0 300.000000
32 34.0 400.000000
In [111]:
#Get the average, minimum, and maximum amount spent by affluence grade
#and the number of customers in each affluence grade
affl_info = df_analysis.groupby('DemAffl')['PromSpend'].agg(['mean', 'min', 'max', 'count']).reset_index()
affl_info
Out[111]:
DemAffl mean min max count
0 0.0 4217.228333 0.01 12000.00 6
1 1.0 4106.715833 0.01 18000.00 36
2 2.0 4131.813700 0.01 24053.35 200
3 3.0 4833.379231 0.01 65053.35 533
4 4.0 4163.785941 0.01 65000.00 983
5 5.0 4719.320463 0.01 94288.31 1662
6 6.0 4487.887167 0.01 97000.00 2234
7 7.0 4598.021167 0.01 81551.03 2588
8 8.0 4268.491409 0.01 100000.00 2634
9 9.0 4437.048189 0.01 110072.44 2584
10 10.0 4774.821013 0.01 296313.85 2172
11 11.0 4533.221068 0.01 78292.50 1676
12 12.0 4367.005302 0.01 95000.00 1241
13 13.0 4085.777278 0.01 120000.00 856
14 14.0 4256.772559 0.01 100000.00 594
15 15.0 5351.760372 0.01 239542.13 403
16 16.0 3308.645592 0.01 48301.78 245
17 17.0 2286.102138 0.01 30937.60 145
18 18.0 3223.334157 0.01 37000.00 89
19 19.0 3488.155070 0.01 47000.00 71
20 20.0 3034.970508 0.01 24106.70 59
21 21.0 3324.370833 0.01 29673.44 36
22 22.0 2034.605200 0.01 12000.00 25
23 23.0 3500.861176 0.01 17553.35 17
24 24.0 4068.178667 0.01 11500.00 15
25 25.0 2167.387692 0.01 6000.00 13
26 26.0 1258.771667 0.01 6000.00 6
27 27.0 879.598000 0.01 3397.96 5
28 28.0 0.010000 0.01 0.01 1
29 29.0 6166.666667 2000.00 13500.00 3
30 30.0 1453.333333 360.00 3000.00 3
31 31.0 300.000000 300.00 300.00 2
32 34.0 400.000000 400.00 400.00 1
In [112]:
#Get the average, minimum, and maximum amount time spent in store by affluence grade
affl_time = df_analysis.groupby('DemAffl')['PromTime'].agg(['mean', 'min', 'max']).reset_index()
affl_time
Out[112]:
DemAffl mean min max
0 0.0 6.166667 4.0 9.0
1 1.0 7.527778 2.0 31.0
2 2.0 7.191919 0.0 32.0
3 3.0 6.717514 0.0 32.0
4 4.0 6.662910 0.0 33.0
5 5.0 6.557927 0.0 33.0
6 6.0 6.677083 0.0 38.0
7 7.0 6.574794 0.0 32.0
8 8.0 6.576819 0.0 36.0
9 9.0 6.543418 0.0 35.0
10 10.0 6.522548 0.0 35.0
11 11.0 6.601567 0.0 33.0
12 12.0 6.465798 0.0 39.0
13 13.0 6.284371 1.0 36.0
14 14.0 6.432479 1.0 32.0
15 15.0 6.275949 0.0 32.0
16 16.0 6.207469 0.0 25.0
17 17.0 6.331034 1.0 27.0
18 18.0 6.551724 1.0 27.0
19 19.0 5.169014 1.0 12.0
20 20.0 5.719298 1.0 17.0
21 21.0 5.500000 1.0 10.0
22 22.0 5.680000 1.0 12.0
23 23.0 4.625000 1.0 16.0
24 24.0 4.785714 1.0 10.0
25 25.0 5.153846 1.0 11.0
26 26.0 5.666667 3.0 13.0
27 27.0 5.200000 3.0 10.0
28 28.0 8.000000 8.0 8.0
29 29.0 4.333333 2.0 9.0
30 30.0 4.500000 2.0 7.0
31 31.0 9.500000 8.0 11.0
32 34.0 9.000000 9.0 9.0
In [113]:
#Plot the average amount of money customers spent in the store this year
#by affluence grade
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_spend,
              x='DemAffl', y='PromSpend', linewidth=3)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the line graph a title
plt.title("Affluence Grade of Customers and Average Amount Spent in Store this Year", fontsize = 20)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis
plot.yaxis.set_major_formatter('${x:1.0f}') #Make the y-axis into dollars


plt.show()
In [114]:
#Plot the average amount of money customers spent in the store this year
#by affluence grades with ten or more customers
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_info[affl_info['count'] >= 10], #Only include affluence grade with ten or more customers
              x='DemAffl', y='mean', linewidth=3)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the line graph a title
plt.title("Affluence Grade of Customers and Average Amount Spent in Store this Year", fontsize = 20)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis
plot.yaxis.set_major_formatter('${x:1.0f}') #Make the y-axis into dollars


plt.show()
In [126]:
#Plot the average amount of money customers spent in the store this year
#by affluence grades with twenty or more customers
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_info[affl_info['count'] >= 20], #Only include affluence grade with twenty or more customers
              x='DemAffl', y='mean', linewidth=3)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
plt.ylim(0,5500) #Have the y-axis be from $0 to $5,500
#Give the line graph a title
plt.title("Affluence Grade of Customers and Average Amount Spent in Store this Year", fontsize = 20)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis
plot.yaxis.set_major_formatter('${x:1.0f}') #Make the y-axis into dollars


plt.show()
In [116]:
affl_age = df_analysis.groupby('DemAffl')['DemAge'].agg(['mean', 'min', 'max']).reset_index()
affl_age
Out[116]:
DemAffl mean min max
0 0.0 57.600000 41.0 72.0
1 1.0 57.593750 37.0 77.0
2 2.0 56.075269 25.0 79.0
3 3.0 55.392079 18.0 79.0
4 4.0 56.039871 19.0 79.0
5 5.0 55.145631 19.0 79.0
6 6.0 55.127946 18.0 79.0
7 7.0 54.663633 18.0 79.0
8 8.0 54.015789 19.0 79.0
9 9.0 53.894737 19.0 79.0
10 10.0 53.967996 18.0 79.0
11 11.0 53.377007 20.0 79.0
12 12.0 53.117749 22.0 79.0
13 13.0 51.778481 18.0 79.0
14 14.0 51.514493 21.0 79.0
15 15.0 50.501319 22.0 78.0
16 16.0 48.914163 25.0 77.0
17 17.0 47.462121 22.0 76.0
18 18.0 46.953488 24.0 79.0
19 19.0 43.936508 26.0 74.0
20 20.0 41.545455 22.0 77.0
21 21.0 41.242424 29.0 59.0
22 22.0 40.478261 28.0 73.0
23 23.0 38.176471 22.0 58.0
24 24.0 44.133333 27.0 59.0
25 25.0 34.166667 25.0 38.0
26 26.0 41.400000 30.0 55.0
27 27.0 35.400000 30.0 38.0
28 28.0 33.000000 33.0 33.0
29 29.0 32.666667 29.0 35.0
30 30.0 37.666667 36.0 39.0
31 31.0 36.500000 35.0 38.0
32 34.0 38.000000 38.0 38.0
In [117]:
#Plot the average age by affluence grade
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_age,
              x='DemAffl', y='mean', linewidth=3)
#Give the line graph a title
plt.title("Affluence Grade of Customers and Average Age", fontsize = 20)
plt.ylim(0,60) #Have the age on the y-axis be from 0 to 60 years old
#Change the font size of the titles for the x and y-axis
plt.xlabel("Affluence Grade", fontsize=11)
plt.ylabel("Average Age", fontsize=11)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [118]:
#Get the average amount spent by age
age_spend = df_analysis.groupby('DemAge')['PromSpend'].mean().reset_index()
age_spend
Out[118]:
DemAge PromSpend
0 18.0 2495.000000
1 19.0 1929.041000
2 20.0 3821.196000
3 21.0 2937.500000
4 22.0 2988.736111
... ... ...
57 75.0 7652.286906
58 76.0 8263.677285
59 77.0 7668.657259
60 78.0 7139.649268
61 79.0 7203.240000

62 rows × 2 columns

In [119]:
#Plot the average amount spent this year by age
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=age_spend,
              x='DemAge', y='PromSpend', linewidth=3)
#Give the line graph a title
plt.title("Average Amount Spent this Year by Age", fontsize = 20)
plt.ylim(0,9000) #Have the y-axis be from $0 to $9,000
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis
plot.yaxis.set_major_formatter('${x:1.0f}') #Make the y-axis into dollars


plt.show()
In [120]:
#Plot the number of customers by affluence grade
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_info,
              x='DemAffl', y='count', linewidth=3)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the line graph a title
plt.title("Number of Customers by Affluence Grade", fontsize = 20)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [121]:
#Get the number of customers by age
age_count = df_analysis.groupby('DemAge')['ID'].count().reset_index().rename({'ID': 'Customers'}, axis=1)
age_count
Out[121]:
DemAge Customers
0 18.0 5
1 19.0 10
2 20.0 10
3 21.0 8
4 22.0 18
... ... ...
57 75.0 265
58 76.0 302
59 77.0 270
60 78.0 205
61 79.0 149

62 rows × 2 columns

In [122]:
#Plot the number of customers by age
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=age_count,
              x='DemAge', y='Customers', linewidth=3)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the line graph a title
plt.title("Number of Customers by Age", fontsize = 20)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [123]:
#Plot average spent by affluence
plt.figure(figsize=(20, 10))
plot = sns.lineplot(data=affl_spend,
              x='DemAffl', y='PromSpend', linewidth=3)
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Give the line graph a title
plt.title("Average Spent By Affluence Grade", fontsize = 20)
sns.despine() #Do not have a top and right side border
plt.tick_params(axis='both', which='major', labelsize=11) #Change the font sizes of the ticks on the x and y-axis


plt.show()
In [124]:
#Get the number of customers that purchased one, two, or three organic products
amt_purchased = df_analysis.groupby('TargetAmt')['ID'].count().reset_index()
amt_purchased = amt_purchased.rename(columns={'ID': 'Customers'}) #Keep the column name accurate
amt_purchased.drop(0, inplace=True) #Drop the number of customers who brought no organic products
amt_purchased
Out[124]:
TargetAmt Customers
1 1 4625
2 2 715
3 3 165
In [125]:
#Plot the number of customers who brought one, two, or three organic products
plt.figure(figsize=(20, 10))
plot = sns.barplot(data=amt_purchased,      #Make the number of customers who brought one organic product blue and the others grey to point out how most customers brought one organic product
              x='TargetAmt', y='Customers', palette=['blue' if x == 1 else 'lightgrey' for x in amt_purchased['TargetAmt']])
plot.set(xlabel=None, ylabel=None) #Get rid of the title for the x and y-axis
#Change the font sizes of the ticks on the x and y-axis
plt.tick_params(axis='x', which='major', labelsize=16)
plt.tick_params(axis='y', which='major', labelsize=11)
#Give the column chart a title
plt.title("Number of Customers by Amount of Organic Products Purchased", fontsize = 20)

sns.despine() #Do not have a top and right side border

# From https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# label each bar in column chart with the number of customers who brought that
#number of organic products
for p in plot.patches:
 # get the height of each bar
 height = p.get_height()
 # adding text to each bar
 plot.text(x = p.get_x()+(p.get_width()/2), # x-coordinate position of data label, padded to be in the middle of the bar
 y = height+100, # y-coordinate position of data label, padded 100 above bar
 s = '{:.0f}'.format(height), # data label format for the number of customers
 fontsize = 14,
 ha = 'center') # sets horizontal alignment (ha) to center


plt.show()
In [ ]:
%%shell

jupyter nbconvert --to html "/content/drive/MyDrive/Colab Notebooks/Untitled5.ipynb"